1 Generate Data

Let \(X \sim \sum_i^k \pi_i \mathcal{N}_i(\mu_i, \sigma^2_i I)\). With \(k = 10\) and \(n = 1000\), \(N = nk\).

n <- 1e3
k <- 10
N <- n*k

set.seed(49)
mux <- sort(sample(-20:20, 10))
muy <- sample(-20:20, 10)

(mu0 <- cbind(mux, muy))
#       mux muy
#  [1,] -19 -10
#  [2,] -17  -3
#  [3,] -11 -16
#  [4,] -10   5
#  [5,]  -6  16
#  [6,]  -3 -20
#  [7,]  -1   2
#  [8,]   3 -13
#  [9,]   6  -5
# [10,]  18  -6
(sigma1 <- sample(c(1,1,1,2,3,4,5), 10, replace=TRUE))
#  [1] 1 4 2 4 1 5 5 1 5 1
sigma0 <- lapply(sigma1, function(x) matrix(x*c(1,0,0,1), 2, 2))

Z <- rep(1:k, each = n)

Here are the means

plot(mu0, col = 1:10, pch = 19, cex = 3)

We now generate the data with

X0 <- data.frame(t(sapply(Z, function(x) rmvnorm(1, mu0[x,], sigma0[[x]]))), Z)
names(X0) <- c('x1', 'x2', 'Z')

head(X0[sample(dim(X0)[1]),])
#              x1          x2 Z
# 5564  -1.391007 -20.7115938 6
# 3350 -11.039710   4.7961061 4
# 1000 -19.664400  -9.6635771 1
# 8542   5.388799  -0.2730803 9
# 1467 -13.249741   1.0983979 2
# 3507 -11.265466   2.7168030 4
dat <- as.matrix(X0[, 1:2])

And here are the data colored with the truth.

ggCol <- c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a')
ggplot(data = X0, aes(x=x1, y=x2, color = as.factor(Z))) +
  scale_color_manual(values = ggCol) + 
  geom_point(alpha = 0.5)

2 Run algorithms

2.1 Run \(jk\)-means

Run with \(jk\)-means with \(j=2, k=10\):

options: max iteration=100 convergence error tolerance = 1E-8

K <- k
j <- 2

set.seed(5431)
set.seed(2^10)
system.time(
  jk <- jkmeans::jkmeansEM(dat, K, j , 100, tol = 1E-8)
)
#    user  system elapsed 
#   0.712   0.004   0.709
jkv <- apply(jk$zeta, 1,  function(x) which( x == max(x) ))

2.2 Run \(k=10\)-means initialized at the truth

system.time({
  kv <- kmeans(dat, centers = mu0)
}
)
#    user  system elapsed 
#   0.004   0.000   0.005

2.3 Run MCLUST

system.time(
  mcv <- Mclust(dat, 
              G = K, 
              modelNames="EII")
)
#    user  system elapsed 
# 177.532   0.948 178.728
mcv
# 'Mclust' model object:
#  best model: spherical, equal volume (EII) with 10 components

3 Results

#            Names       ari
# 1     Truth v km 0.9579922
# 2     Truth v jk 0.8206772
# 3 Truth v mclust 0.9563053
Y <- data.frame(X0, jk=jkv, kv = kv$cluster,
                mc=mcv$classification, kvpp=kvpp$cluster)

p1 <- ggplot(data = Y, aes(x=x1, y=x2)) +
  scale_color_manual(values = ggCol)